full_data <- readRDS('data/full_data.rds')
H2S_dm <- full_data %>%
group_by(day, Monitor) %>%
summarise(H2S_daily_max = max(H2S, na.rm = TRUE)) %>%
mutate(H2S_daily_max = if_else(H2S_daily_max == -Inf, NA, H2S_daily_max))
## Warning: There were 501 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `H2S_daily_max = max(H2S, na.rm = TRUE)`.
## ℹ In group 1215: `day = 2020-07-12`, `Monitor = "First Methodist"`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 500 remaining warnings.
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
H2S_ma <- H2S_dm %>%
mutate(yearmonth = format(day, "%Y-%m")) %>%
group_by(yearmonth, Monitor) %>%
summarise(H2S_monthly_average = mean(H2S_daily_max, na.rm = TRUE))
## `summarise()` has grouped output by 'yearmonth'. You can override using the
## `.groups` argument.
H2S_dm_graph <- H2S_dm %>%
ggplot(aes(x=day, y=H2S_daily_max, group=Monitor, color=Monitor)) +
geom_line() +
labs(title = "Daily Max H2S concentration by monitor", y = 'Daily Max H2S', x = 'time') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45))
ggplotly(H2S_dm_graph) %>% layout(dragmode = 'pan')
H2S_dm_graph_b <- H2S_dm %>%
ggplot(aes(x=day, y=H2S_daily_max, group=Monitor, color=Monitor)) +
geom_line() +
labs(title = "Daily Max H2S concentration by monitor w/o outlier", y = 'Daily Max H2S', x = 'time') +
scale_y_continuous(limits = c(0, 100)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45))
ggplotly(H2S_dm_graph_b) %>% layout(dragmode = 'pan')
H2S_ma_graph <- H2S_ma %>%
ggplot(aes(x=yearmonth, y=H2S_monthly_average, group=Monitor, color=Monitor)) +
geom_line() +
labs(title = "Monthly Average H2S concentration by monitor", y = 'Monthly Average H2S', x = 'time') +
scale_x_discrete(breaks = unique(H2S_ma$yearmonth)[seq(1, length(unique(H2S_ma$yearmonth)), by = 6)]) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45))
ggplotly(H2S_ma_graph) %>% layout(dragmode = 'pan')
H2S_ma_graph_b <- H2S_ma %>%
ggplot(aes(x=yearmonth, y=H2S_monthly_average, group=Monitor, color=Monitor)) +
geom_line() +
labs(title = "Monthly Average H2S concentration by monitor w/o outlier", y = 'Monthly Average H2S', x = 'time') +
scale_x_discrete(breaks = unique(H2S_ma$yearmonth)[seq(1, length(unique(H2S_ma$yearmonth)), by = 6)]) +
scale_y_continuous(limits=c(0, 50)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45))
ggplotly(H2S_ma_graph_b) %>% layout(dragmode = 'pan')
The line with the largest deviation/peak corresponds to the 213th&Chico monitor, in 2021-10 Maybe we should start from January 2022 to start modelling to remove the extreme values, or simply removing the 213 monitor since that’s set up specifically for the event.
full_data %>%
polarPlot(pollutant = "H2S", col = "turbo",
key.position = "bottom",
key.header = "mean H2S",
key.footer = NULL, title = 'hi')
# Compute daily average wd/ws
daily_full <- full_data %>%
select(c(H2S, ws, wd, latitude, longitude, Monitor, MinDist, Converted_Angle, Refinery, year, month, day)) %>%
group_by(Monitor, day, latitude, longitude, Refinery, Converted_Angle, year, month, MinDist) %>%
summarise(H2S_daily_max = max(H2S, na.rm=TRUE),
H2S_daily_avg = mean(H2S, na.rm=TRUE),
ws_avg = mean(ws, na.rm=TRUE),
wd_avg = mean(wd, na.rm=TRUE)) %>%
mutate(H2S_daily_max = if_else(H2S_daily_max == -Inf, NA, H2S_daily_max))
## Warning: There were 3681 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `H2S_daily_max = max(H2S, na.rm = TRUE)`.
## ℹ In group 453: `Monitor = "First Methodist"`, `day = 2020-07-09`, `latitude =
## NA`, `longitude = NA`, `Refinery = "Phillips 66 (Wilmington)"`,
## `Converted_Angle = 209`, `year = "2020"`, `month = "07"`, `MinDist =
## 2119.191`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 3680 remaining warnings.
## `summarise()` has grouped output by 'Monitor', 'day', 'latitude', 'longitude',
## 'Refinery', 'Converted_Angle', 'year', 'month'. You can override using the
## `.groups` argument.
h2s_model_a <- gam(H2S~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery, data = full_data)
summary(h2s_model_a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws +
## I(1/MinDist^2) + Refinery
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.091e-01 1.084e+00 0.839 0.4016
## year2021 3.076e-01 1.082e+00 0.284 0.7763
## year2022 -7.302e-01 1.083e+00 -0.675 0.5000
## wd_secQ2 8.995e-02 4.755e-02 1.892 0.0585 .
## wd_secQ3 8.260e-01 5.171e-02 15.974 < 2e-16 ***
## wd_secQ4 4.469e-01 4.417e-02 10.118 < 2e-16 ***
## ws -9.682e-02 5.944e-03 -16.290 < 2e-16 ***
## I(1/MinDist^2) 2.222e+04 3.730e+04 0.596 0.5514
## RefineryMarathon (Carson) 2.989e+00 5.907e-02 50.603 < 2e-16 ***
## RefineryMarathon (Wilmington) 8.388e-01 6.456e-02 12.993 < 2e-16 ***
## RefineryPhillips 66 (Wilimington) -5.341e-01 5.964e-02 -8.955 < 2e-16 ***
## RefineryPhillips 66 (Wilmington) 3.880e-01 6.332e-02 6.127 8.94e-10 ***
## RefineryTorrance Refinery 8.476e-02 6.389e-02 1.327 0.1846
## RefineryValero Refinery 5.988e-01 6.286e-02 9.526 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.996 8 881.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.00848 Deviance explained = 0.849%
## GCV = 336.53 Scale est. = 336.52 n = 1730935
plot(h2s_model_a)
h2s_model_b <- gam(H2S~s(as.numeric(month),bs='cc')+wd_sec+ws+I(1/MinDist^2)+
Refinery+s(latitude, longitude, bs='tp', k = 5),
data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_b)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + wd_sec + ws + I(1/MinDist^2) +
## Refinery + s(latitude, longitude, bs = "tp", k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.3701574 0.0568743 59.256 < 2e-16 ***
## wd_secQ2 -0.1223242 0.0032107 -38.098 < 2e-16 ***
## wd_secQ3 -0.0276010 0.0035146 -7.853 4.06e-15 ***
## wd_secQ4 0.0497316 0.0030516 16.297 < 2e-16 ***
## ws -0.0379057 0.0003735 -101.491 < 2e-16 ***
## I(1/MinDist^2) 0.0005294 0.0000120 44.128 < 2e-16 ***
## RefineryMarathon (Carson) -2.3029517 0.0477069 -48.273 < 2e-16 ***
## RefineryMarathon (Wilmington) -2.7010960 0.0622682 -43.378 < 2e-16 ***
## RefineryPhillips 66 (Wilimington) -3.5993158 0.0805031 -44.710 < 2e-16 ***
## RefineryPhillips 66 (Wilmington) -3.2913878 0.0854036 -38.539 < 2e-16 ***
## RefineryTorrance Refinery -0.2786949 0.0148328 -18.789 < 2e-16 ***
## RefineryValero Refinery -3.3474680 0.0792409 -42.244 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.998 8 1740 <2e-16 ***
## s(latitude,longitude) 2.000 2 2368 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 23/24
## R-sq.(adj) = 0.102 Deviance explained = 10.2%
## GCV = 0.64572 Scale est. = 0.6457 n = 769068
plot(h2s_model_b)
h2s_dm_model_a <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery, data = full_data)
summary(h2s_dm_model_a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + wd_sec +
## ws + I(1/MinDist^2) + Refinery
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.024e+00 5.659e+00 0.888 0.37466
## year2021 1.330e+00 5.651e+00 0.235 0.81393
## year2022 -8.367e+00 5.652e+00 -1.480 0.13878
## wd_secQ2 -1.341e+00 2.445e-01 -5.484 4.16e-08 ***
## wd_secQ3 3.643e+00 2.663e-01 13.679 < 2e-16 ***
## wd_secQ4 -2.626e-01 2.275e-01 -1.154 0.24832
## ws 4.156e-02 3.054e-02 1.361 0.17364
## I(1/MinDist^2) 9.242e+04 1.926e+05 0.480 0.63123
## RefineryMarathon (Carson) 2.569e+01 3.040e-01 84.515 < 2e-16 ***
## RefineryMarathon (Wilmington) 4.007e+00 3.327e-01 12.043 < 2e-16 ***
## RefineryPhillips 66 (Wilimington) -8.985e-01 3.075e-01 -2.922 0.00348 **
## RefineryPhillips 66 (Wilmington) 3.154e+00 3.267e-01 9.656 < 2e-16 ***
## RefineryTorrance Refinery 8.734e-01 3.292e-01 2.653 0.00799 **
## RefineryValero Refinery 5.897e+00 3.242e-01 18.185 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.998 8 2750 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0229 Deviance explained = 2.3%
## GCV = 9173.7 Scale est. = 9173.6 n = 1777112
plot(h2s_dm_model_a)
h2s_dm_model_b <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_sec+ws+
I(1/MinDist^2)+Refinery+s(latitude, longitude, bs='tp', k=5),
data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_b)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_sec + ws +
## I(1/MinDist^2) + Refinery + s(latitude, longitude, bs = "tp",
## k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.556e+01 4.284e-01 36.324 < 2e-16 ***
## wd_secQ2 -3.802e-01 1.199e-02 -31.723 < 2e-16 ***
## wd_secQ3 -2.731e-01 1.312e-02 -20.813 < 2e-16 ***
## wd_secQ4 -3.293e-02 1.139e-02 -2.891 0.00384 **
## ws -6.881e-02 1.394e-03 -49.358 < 2e-16 ***
## I(1/MinDist^2) 2.630e-03 8.341e-05 31.531 < 2e-16 ***
## RefineryMarathon (Carson) -1.115e+01 4.713e-01 -23.652 < 2e-16 ***
## RefineryMarathon (Wilmington) -1.387e+01 4.938e-01 -28.095 < 2e-16 ***
## RefineryPhillips 66 (Wilimington) -1.860e+01 5.276e-01 -35.248 < 2e-16 ***
## RefineryPhillips 66 (Wilmington) -1.830e+01 5.416e-01 -33.783 < 2e-16 ***
## RefineryTorrance Refinery -2.225e+00 3.238e-01 -6.873 6.31e-12 ***
## RefineryValero Refinery -1.383e+01 5.266e-01 -26.269 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.965 8 2801 <2e-16 ***
## s(latitude,longitude) 2.000 2 1573 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 23/24
## R-sq.(adj) = 0.256 Deviance explained = 25.6%
## GCV = 8.9977 Scale est. = 8.9974 n = 769068
plot(h2s_dm_model_b)
# Also include angle to refinery
h2s_dm_model_c <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
I(1/MinDist^2)+Refinery+Converted_Angle+
s(latitude, longitude, bs='tp', k=5),
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_c)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg +
## I(1/MinDist^2) + Refinery + Converted_Angle + s(latitude,
## longitude, bs = "tp", k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.212e+01 3.794e+00 5.830 6.18e-09 ***
## wd_avg 4.190e-03 1.536e-03 2.727 0.006426 **
## ws_avg -5.292e-01 5.738e-02 -9.223 < 2e-16 ***
## I(1/MinDist^2) 2.581e-03 5.113e-04 5.047 4.77e-07 ***
## RefineryMarathon (Carson) 1.916e+00 6.758e-01 2.836 0.004602 **
## RefineryMarathon (Wilmington) -5.750e+00 1.035e+00 -5.556 3.01e-08 ***
## RefineryPhillips 66 (Wilimington) -2.133e+01 4.006e+00 -5.324 1.10e-07 ***
## RefineryPhillips 66 (Wilmington) -2.374e+01 4.854e+00 -4.892 1.05e-06 ***
## RefineryTorrance Refinery 1.102e+01 2.159e+00 5.104 3.54e-07 ***
## RefineryValero Refinery -9.947e+00 2.717e+00 -3.660 0.000257 ***
## Converted_Angle -4.559e-02 9.494e-03 -4.801 1.66e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 5.160 8.000 6.37 <2e-16 ***
## s(latitude,longitude) 1.895 1.895 69.03 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 21/23
## R-sq.(adj) = 0.269 Deviance explained = 27.3%
## GCV = 9.0112 Scale est. = 8.959 n = 2791
plot(h2s_dm_model_c)
h2s_ma_model_a <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery, data = full_data)
summary(h2s_ma_model_a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_monthly_average ~ s(as.numeric(month), bs = "cc") + year +
## wd_sec + ws + I(1/MinDist^2) + Refinery
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.562e+00 4.624e+00 0.987 0.323830
## year2021 1.076e+00 4.618e+00 0.233 0.815796
## year2022 -8.169e+00 4.618e+00 -1.769 0.076930 .
## wd_secQ2 -7.359e-01 1.981e-01 -3.714 0.000204 ***
## wd_secQ3 3.097e+00 2.157e-01 14.360 < 2e-16 ***
## wd_secQ4 -7.004e-01 1.846e-01 -3.794 0.000148 ***
## ws 1.185e-01 2.461e-02 4.815 1.47e-06 ***
## I(1/MinDist^2) 2.964e+04 1.569e+05 0.189 0.850144
## RefineryMarathon (Carson) 2.395e+01 2.453e-01 97.616 < 2e-16 ***
## RefineryMarathon (Wilmington) 4.224e+00 2.706e-01 15.613 < 2e-16 ***
## RefineryPhillips 66 (Wilimington) -2.615e-02 2.499e-01 -0.105 0.916656
## RefineryPhillips 66 (Wilmington) 3.524e+00 2.658e-01 13.262 < 2e-16 ***
## RefineryTorrance Refinery 1.570e+00 2.636e-01 5.956 2.58e-09 ***
## RefineryValero Refinery 6.305e+00 2.632e-01 23.960 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.999 8 3996 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0307 Deviance explained = 3.07%
## GCV = 6125.1 Scale est. = 6125 n = 1814982
plot(h2s_ma_model_a)
h2s_ma_model_b <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery+s(latitude, longitude, bs='tp', k=10), data = full_data)
summary(h2s_ma_model_b)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_monthly_average ~ s(as.numeric(month), bs = "cc") + year +
## wd_sec + ws + I(1/MinDist^2) + Refinery + s(latitude, longitude,
## bs = "tp", k = 10)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.223e+03 1.439e+01 -85.009 < 2e-16 ***
## year2021 4.242e+00 4.442e+00 0.955 0.339588
## year2022 -5.266e-01 4.442e+00 -0.119 0.905646
## wd_secQ2 4.182e-01 1.951e-01 2.143 0.032118 *
## wd_secQ3 7.243e-01 2.123e-01 3.411 0.000647 ***
## wd_secQ4 -1.549e-01 1.814e-01 -0.854 0.393162
## ws 4.626e-02 2.439e-02 1.896 0.057917 .
## I(1/MinDist^2) 5.255e+07 4.687e+05 112.116 < 2e-16 ***
## RefineryMarathon (Carson) 9.406e+02 1.506e+01 62.463 < 2e-16 ***
## RefineryMarathon (Wilmington) 1.232e+03 1.580e+01 77.956 < 2e-16 ***
## RefineryPhillips 66 (Wilimington) 1.645e+03 1.577e+01 104.304 < 2e-16 ***
## RefineryPhillips 66 (Wilmington) 1.722e+03 1.614e+01 106.675 < 2e-16 ***
## RefineryTorrance Refinery 2.228e+02 1.037e+01 21.490 < 2e-16 ***
## RefineryValero Refinery 1.615e+03 1.703e+01 94.818 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 8 8 5187 <2e-16 ***
## s(latitude,longitude) 2 2 15468 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.144 Deviance explained = 14.4%
## GCV = 5667 Scale est. = 5666.9 n = 1730935
plot(h2s_ma_model_b)
daily_full <- daily_full %>%
mutate(Refinery = str_replace_all(str_replace_all(Refinery, '[()]', ''), ' ', '_'),
Monitor = str_replace_all(Refinery, ' ', '_'))
## Try for a continuous month
tune_grid <- expand.grid(nrounds = c(100, 200),
max_depth = c(2, 3, 4),
eta = c(0.1, 0.3),
gamma = c(0.01, 0.001),
colsample_bytree = c(0.5, 1),
min_child_weight = 0,
subsample = c(0.5, 0.75, 1))
# Run algorithms using 10-fold cross validation
control <- trainControl(method="cv", number=10,verboseIter=TRUE, search='grid')
fit.xgb <- train(H2S_daily_max~.,
method = 'xgbTree',
data = fastDummies::dummy_cols(daily_full[complete.cases(daily_full),] %>%
ungroup() %>%
filter(day >= '2022-02-01') %>%
select(-c(year, day, H2S_daily_avg)),
remove_selected_columns = TRUE)
,
trControl=control,
tuneGrid = tune_grid,
tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
getTrainPerf(fit.xgb)
fit.xgb$finalModel
## ##### xgb.Booster
## raw: 116.3 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "3", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.5", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## xgb.attributes:
## niter
## # of features: 31
## niter: 100
## nfeatures : 31
## xNames : latitude longitude Converted_Angle MinDist ws_avg wd_avg Monitor_Chevron_El_Segundo Monitor_Marathon_Carson Monitor_Marathon_Wilmington Monitor_Phillips_66_Wilimington Monitor_Phillips_66_Wilmington Monitor_Torrance_Refinery Monitor_Valero_Refinery Refinery_Chevron_El_Segundo Refinery_Marathon_Carson Refinery_Marathon_Wilmington Refinery_Phillips_66_Wilimington Refinery_Phillips_66_Wilmington Refinery_Torrance_Refinery Refinery_Valero_Refinery month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 37 100 3 0.1 0.01 0.5 0 0.5
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb,scale=FALSE)
plot(imp, top=10)
xgb.plot.shap(data =
fastDummies::dummy_cols(daily_full[complete.cases(daily_full),] %>%
ungroup() %>%
filter(day >= '2022-02-01') %>%
select(-c(year, day, H2S_daily_avg, H2S_daily_max)),
remove_selected_columns = TRUE) %>%
as.matrix(),
model = fit.xgb$finalModel, top_n = 12, n_col = 3)
## Warning in sqrt(sum.squares/one.delta): NaNs produced
xgb.ggplot.shap.summary(data = fastDummies::dummy_cols(daily_full[complete.cases(daily_full),] %>%
ungroup() %>%
filter(day >= '2022-02-01') %>%
select(-c(year, day, H2S_daily_avg, H2S_daily_max)),
remove_selected_columns = TRUE) %>%
as.matrix(),
model = fit.xgb$finalModel, top_n = 10)